Импорт данных

In [3]:
# Load the preprocessed TEP dataset, using the 'Index' column as the row index.
csv_path = 'data/processed/tep_data.csv'
data = pd.read_csv(csv_path, index_col='Index')
print(f'Len of dataset: {data.shape[0]}')
Len of dataset: 12801

Гиперпараметры модели

In [7]:
from src.models.autoencoder import build_autoencoder
import keras

# Architecture of the autoencoder (window_length is defined in an earlier cell).
create_params = dict(
    input_shape=window_length,
    hidden_layer_size=16,            # size of the bottleneck layer
    hidden_layer_activation=None,    # linear bottleneck
    reg_strength=0.001,              # presumably an L2 penalty — TODO confirm in build_autoencoder
    input_dropout=-1,                # negative value presumably disables input dropout — TODO confirm
)

compile_params = dict(
    # NOTE(review): `lr` is the pre-Keras-2.3 spelling kept for this environment
    # (TF1 backend); newer Keras uses `learning_rate`.
    optimizer=keras.optimizers.Adam(lr=0.001),
    loss='mse',
)

fit_params = dict(
    batch_size=64,
    epochs=100,
    verbose=1,
    # Shrink the LR after 5 stagnant epochs; stop after 15 epochs without
    # an improvement of at least 0.01 (both monitor val_loss by default).
    callbacks=[keras.callbacks.ReduceLROnPlateau(patience=5),
               keras.callbacks.EarlyStopping(min_delta=0.01, patience=15)]
)


def model_fn():
    """Build and compile a fresh autoencoder for one cross-validation fold.

    PEP 8: a named ``def`` replaces the original assigned lambda; the
    zero-argument callable interface is unchanged.
    """
    return build_autoencoder(create_params, compile_params)
Using TensorFlow backend.

Обучение модели

Кросс-валидация

In [20]:
from src.utils import cross_validate

# Turn the single-component series into overlapping windows.
# Presumably yields shape (n_windows, window_length) after .squeeze()
# drops the singleton channel axis added by [:, None] — TODO confirm
# against rolling_window's contract.
X_comp = rolling_window(X_one_component.to_numpy()[:, None], window_length=window_length).squeeze()

# 3-fold CV; the fold logs below show an expanding training set
# (3185 -> 6369 -> 9553 samples), i.e. a time-series style split.
# Inputs double as reconstruction targets, hence X_comp is passed twice.
results = cross_validate(model_fn, 3, fit_params, X_comp, X_comp)
Fold 0...
WARNING:tensorflow:From C:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

WARNING:tensorflow:From C:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:973: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

Train on 3185 samples, validate on 3184 samples
Epoch 1/100
3185/3185 [==============================] - 1s 210us/step - loss: 1.0796 - val_loss: 0.6876
Epoch 2/100
3185/3185 [==============================] - 0s 41us/step - loss: 0.3806 - val_loss: 0.2924
Epoch 3/100
3185/3185 [==============================] - 0s 38us/step - loss: 0.2524 - val_loss: 0.2509
Epoch 4/100
3185/3185 [==============================] - 0s 24us/step - loss: 0.2305 - val_loss: 0.2321
Epoch 5/100
3185/3185 [==============================] - 0s 26us/step - loss: 0.2149 - val_loss: 0.2164
Epoch 6/100
3185/3185 [==============================] - 0s 25us/step - loss: 0.2015 - val_loss: 0.2031
Epoch 7/100
3185/3185 [==============================] - 0s 24us/step - loss: 0.1902 - val_loss: 0.1911
Epoch 8/100
3185/3185 [==============================] - 0s 29us/step - loss: 0.1801 - val_loss: 0.1811
Epoch 9/100
3185/3185 [==============================] - 0s 31us/step - loss: 0.1717 - val_loss: 0.1725
Epoch 10/100
3185/3185 [==============================] - 0s 31us/step - loss: 0.1645 - val_loss: 0.1654
Epoch 11/100
3185/3185 [==============================] - 0s 33us/step - loss: 0.1585 - val_loss: 0.1595
Epoch 12/100
3185/3185 [==============================] - 0s 33us/step - loss: 0.1534 - val_loss: 0.1544
Epoch 13/100
3185/3185 [==============================] - 0s 32us/step - loss: 0.1491 - val_loss: 0.1503
Epoch 14/100
3185/3185 [==============================] - 0s 30us/step - loss: 0.1455 - val_loss: 0.1468
Epoch 15/100
3185/3185 [==============================] - 0s 29us/step - loss: 0.1424 - val_loss: 0.1439
Epoch 16/100
3185/3185 [==============================] - 0s 27us/step - loss: 0.1397 - val_loss: 0.1413
Epoch 17/100
3185/3185 [==============================] - 0s 18us/step - loss: 0.1374 - val_loss: 0.1392
Epoch 18/100
3185/3185 [==============================] - 0s 24us/step - loss: 0.1354 - val_loss: 0.1371
Epoch 19/100
3185/3185 [==============================] - 0s 28us/step - loss: 0.1336 - val_loss: 0.1354
Epoch 20/100
3185/3185 [==============================] - 0s 29us/step - loss: 0.1321 - val_loss: 0.1340
Epoch 21/100
3185/3185 [==============================] - 0s 30us/step - loss: 0.1307 - val_loss: 0.1328
Epoch 22/100
3185/3185 [==============================] - 0s 30us/step - loss: 0.1296 - val_loss: 0.1317
Epoch 23/100
3185/3185 [==============================] - 0s 29us/step - loss: 0.1284 - val_loss: 0.1306
Epoch 24/100
3185/3185 [==============================] - 0s 32us/step - loss: 0.1274 - val_loss: 0.1296
Epoch 25/100
3185/3185 [==============================] - 0s 38us/step - loss: 0.1266 - val_loss: 0.1292
Epoch 26/100
3185/3185 [==============================] - 0s 36us/step - loss: 0.1259 - val_loss: 0.1281
Epoch 27/100
3185/3185 [==============================] - 0s 34us/step - loss: 0.1252 - val_loss: 0.1275
Epoch 28/100
3185/3185 [==============================] - 0s 32us/step - loss: 0.1246 - val_loss: 0.1268
Epoch 29/100
3185/3185 [==============================] - 0s 32us/step - loss: 0.1240 - val_loss: 0.1264
Epoch 30/100
3185/3185 [==============================] - 0s 28us/step - loss: 0.1235 - val_loss: 0.1258
Epoch 31/100
3185/3185 [==============================] - 0s 28us/step - loss: 0.1231 - val_loss: 0.1255
Epoch 32/100
3185/3185 [==============================] - 0s 28us/step - loss: 0.1226 - val_loss: 0.1252
Epoch 33/100
3185/3185 [==============================] - 0s 25us/step - loss: 0.1223 - val_loss: 0.1247
Epoch 34/100
3185/3185 [==============================] - 0s 27us/step - loss: 0.1219 - val_loss: 0.1243
Epoch 35/100
3185/3185 [==============================] - 0s 28us/step - loss: 0.1216 - val_loss: 0.1241
Epoch 36/100
3185/3185 [==============================] - 0s 24us/step - loss: 0.1213 - val_loss: 0.1237
Fold 1...
Train on 6369 samples, validate on 3184 samples
Epoch 1/100
6369/6369 [==============================] - 0s 53us/step - loss: 0.3681 - val_loss: 0.1769
Epoch 2/100
6369/6369 [==============================] - 0s 24us/step - loss: 0.1667 - val_loss: 0.1468
Epoch 3/100
6369/6369 [==============================] - 0s 15us/step - loss: 0.1458 - val_loss: 0.1348
Epoch 4/100
6369/6369 [==============================] - 0s 15us/step - loss: 0.1365 - val_loss: 0.1284
Epoch 5/100
6369/6369 [==============================] - 0s 15us/step - loss: 0.1313 - val_loss: 0.1245
Epoch 6/100
6369/6369 [==============================] - 0s 16us/step - loss: 0.1279 - val_loss: 0.1219
Epoch 7/100
6369/6369 [==============================] - 0s 19us/step - loss: 0.1257 - val_loss: 0.1201
Epoch 8/100
6369/6369 [==============================] - 0s 20us/step - loss: 0.1241 - val_loss: 0.1189
Epoch 9/100
6369/6369 [==============================] - 0s 20us/step - loss: 0.1229 - val_loss: 0.1178
Epoch 10/100
6369/6369 [==============================] - 0s 20us/step - loss: 0.1220 - val_loss: 0.1171
Epoch 11/100
6369/6369 [==============================] - 0s 21us/step - loss: 0.1213 - val_loss: 0.1165
Epoch 12/100
6369/6369 [==============================] - 0s 21us/step - loss: 0.1208 - val_loss: 0.1160
Epoch 13/100
6369/6369 [==============================] - 0s 19us/step - loss: 0.1203 - val_loss: 0.1157
Epoch 14/100
6369/6369 [==============================] - 0s 20us/step - loss: 0.1199 - val_loss: 0.1154
Epoch 15/100
6369/6369 [==============================] - 0s 19us/step - loss: 0.1196 - val_loss: 0.1152
Epoch 16/100
6369/6369 [==============================] - 0s 24us/step - loss: 0.1193 - val_loss: 0.1150
Epoch 17/100
6369/6369 [==============================] - 0s 24us/step - loss: 0.1192 - val_loss: 0.1149
Epoch 18/100
6369/6369 [==============================] - 0s 23us/step - loss: 0.1190 - val_loss: 0.1148
Epoch 19/100
6369/6369 [==============================] - 0s 24us/step - loss: 0.1188 - val_loss: 0.1146
Epoch 20/100
6369/6369 [==============================] - 0s 23us/step - loss: 0.1187 - val_loss: 0.1145
Fold 2...
Train on 9553 samples, validate on 3184 samples
Epoch 1/100
9553/9553 [==============================] - 1s 57us/step - loss: 0.2905 - val_loss: 0.1618
Epoch 2/100
9553/9553 [==============================] - 0s 28us/step - loss: 0.1459 - val_loss: 0.1371
Epoch 3/100
9553/9553 [==============================] - 0s 29us/step - loss: 0.1315 - val_loss: 0.1288
Epoch 4/100
9553/9553 [==============================] - 0s 28us/step - loss: 0.1256 - val_loss: 0.1248
Epoch 5/100
9553/9553 [==============================] - 0s 25us/step - loss: 0.1227 - val_loss: 0.1228
Epoch 6/100
9553/9553 [==============================] - 0s 21us/step - loss: 0.1210 - val_loss: 0.1214
Epoch 7/100
9553/9553 [==============================] - 0s 17us/step - loss: 0.1199 - val_loss: 0.1205
Epoch 8/100
9553/9553 [==============================] - 0s 17us/step - loss: 0.1192 - val_loss: 0.1200
Epoch 9/100
9553/9553 [==============================] - 0s 18us/step - loss: 0.1187 - val_loss: 0.1196
Epoch 10/100
9553/9553 [==============================] - 0s 18us/step - loss: 0.1183 - val_loss: 0.1192
Epoch 11/100
9553/9553 [==============================] - 0s 18us/step - loss: 0.1180 - val_loss: 0.1190
Epoch 12/100
9553/9553 [==============================] - 0s 17us/step - loss: 0.1177 - val_loss: 0.1187
Epoch 13/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1175 - val_loss: 0.1184
Epoch 14/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1173 - val_loss: 0.1183
Epoch 15/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1172 - val_loss: 0.1182
Epoch 16/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1171 - val_loss: 0.1181
Epoch 17/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1170 - val_loss: 0.1180
Epoch 18/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1169 - val_loss: 0.1181
Epoch 19/100
9553/9553 [==============================] - 0s 14us/step - loss: 0.1168 - val_loss: 0.1179
In [21]:
from src.visualization.visualize import visualize_cv_result

# Plot the per-fold loss curves collected by cross_validate above.
# (Dropped the needless `f` prefix: the string had no placeholders.)
visualize_cv_result(results, 'Autoencoder loss')

Извлечём слой, который "эмбеддит"

In [26]:
import keras
from keras.layers import Input
from keras.models import Model, load_model

# Reload the trained autoencoder from disk.
model = load_model('models/onedimensional_autoencoder.h5')

# Wrap only the first layer of the autoencoder as a standalone encoder.
# NOTE(review): assumes model.layers[0] is the encoding (bottleneck) layer —
# confirm against the architecture built by build_autoencoder.
inp = Input(shape=(window_length,))
encoder_layer = model.layers[0]
encoded = encoder_layer(inp)
# NOTE(review): at this point `encoded` is a symbolic Keras tensor, not data.
# Later cells pass `encoded` to sklearn/yellowbrick, which presumably only
# works because `encoded` was rebound to encoder.predict(...) output in a
# cell not visible here (execution counts are out of order) — verify.

encoder = Model(inputs=inp, outputs=encoded)

Добавим аномалии в нормальные данные

In [22]:
from src.data.generate import generate_anomalies

# Inject synthetic anomalies into a copy of one component's signal.
data_with_anom = data[str(component)].copy()
anom_amount = 100

# Anomaly lengths are drawn from [window_length, 2 * window_length), so
# restricting the start positions to len - 2 * window_length guarantees every
# injected segment fits inside the series. This resolves the old FIXME, which
# simply raised "Try again" when a random start landed too close to the end.
max_start = len(data_with_anom) - 2 * window_length
anom_idxs_start = np.random.choice(max_start,
                                   anom_amount,
                                   replace=False)
anom_lens = np.zeros(anom_amount, dtype=int)           # length of each injected anomaly
anom_idxs = np.zeros(len(data_with_anom), dtype=int)   # per-sample mask: 1 = anomalous

for i, start in enumerate(anom_idxs_start):
    length = np.random.randint(window_length, window_length * 2)

    anom_lens[i] = length
    anoms = generate_anomalies(length)
    anom_idxs[start:start + length] = 1
    # generate_anomalies presumably returns several candidate patterns of the
    # requested length; pick one at random — TODO confirm its contract.
    data_with_anom[start:start + length] = anoms[np.random.randint(len(anoms))]

Кластеризация эмбеддингов

KMeans

Найдём оптимальное значение количества кластеров по elbow

In [28]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# Elbow method over k = 2..9 using the distortion metric (mean sum of
# squared distances to the nearest centroid).
kmeans = KMeans()
visualizer = KElbowVisualizer(kmeans, k=(2,10), metric='distortion')
# NOTE(review): In[26] defines `encoded` as a symbolic Keras tensor; fitting
# sklearn on it only works if `encoded` was rebound to actual embeddings
# (e.g. encoder.predict(X_comp)) in a cell not shown — confirm execution order.
visualizer.fit(encoded)
visualizer.poof()  # poof() is deprecated in newer yellowbrick in favor of show()
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x258fea18f60>
In [29]:
from yellowbrick.cluster import InterclusterDistance

# k = 3, presumably chosen from the elbow plot above — confirm.
# `km` is reused by the prediction cell below, so it must stay fitted.
km = KMeans(3)
intra_visualizer = InterclusterDistance(km)

intra_visualizer.fit(encoded)        # fits km on the embeddings as a side effect
intra_visualizer.poof()              # draw the intercluster-distance map
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x2588009a7f0>

Будем считать, что 0-ой и 2-ой кластеры полностью аномальные (выбрали наименьшие кластеры)

In [33]:
# Label every embedded window with the fitted k-means model, then treat
# everything outside the largest (presumably "normal") cluster as anomalous.
pred = km.predict(encoded)
cluster_sizes = pd.Series(pred).value_counts()
maximal_cluster = cluster_sizes.idxmax()
anoms_pred = np.flatnonzero(pred != maximal_cluster)
In [35]:
from sklearn.metrics import recall_score, precision_score
from src.utils import intersection_over_true

def _score(metric_fn):
    """Score predicted anomaly indices against the injected ground truth
    using the given sklearn metric (deduplicates the two identical calls)."""
    return intersection_over_true(data.shape[0], anom_idxs_start, anom_lens,
                                  anoms_pred, window_length, metric_fn)

recall = _score(recall_score)
precision = _score(precision_score)
print(f'Полнота={recall:.3f}, Точность={precision:.3f}')
Полнота=0.216, Точность=0.240
In [37]:
# Blue  - anomalies predicted by the model
# Red   - anomalies that were not caught by the model
# Green - original data that is neither a true anomaly nor predicted as one
# NOTE(review): `p` is a figure built in a cell not shown here (presumably a
# bokeh plot, given bare `show`); confirm it is defined before this cell runs.

show(p)
In [ ]: